# Load the libraries for our analysis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(knitr)
library(usmap)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(viridis)
## Loading required package: viridisLite
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
# Lookup table mapping raw column names to human-readable labels used on
# every plot axis and legend in this document.
clean_columns_names <- c(
  term             = "Term",
  loan_amnt        = "Loan amount",
  int_rate         = "Interest rate",
  grade            = "Grade",
  sub_grade        = "Sub-grade",
  annual_inc       = "Annual Income",
  loan_status      = "Loan status",
  purpose          = "Purpose",
  us_state         = "U.S. state",
  open_acc         = "Open account",
  total_acc        = "Total account",
  total_pymnt      = "Total payment",
  years_employment = "Years of employment",
  out_prncp        = "Out principal",
  loan_per_income  = "Loan per income",
  funded_amnt      = "Funded amount"
)
# Whether each categorical variable has a natural ordering; the plotting
# helpers leave ordered factors alone and re-sort unordered ones by default
# rate. (Fixed: use `<-` for assignment, not `=`, per R style conventions.)
ordered_factor <- c(
  term             = FALSE,
  grade            = TRUE,
  sub_grade        = TRUE,
  purpose          = FALSE,
  years_employment = TRUE,
  us_state         = FALSE,
  politics         = FALSE
)
# Load the dataset (relative path: expects a sibling `data/` directory;
# under R >= 4.0 read.csv keeps strings as character, not factor)
data_loan <- read.csv("../data/loan.csv")
# Get a first impression: kable() renders summary() as a markdown table
kable(summary(data_loan))
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_il_6m | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 54734 | Min. : 70473 | Min. : 500 | Min. : 500 | Min. : 0 | Length:887379 | Min. : 5.32 | Min. : 15.67 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Min. : 0 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Length:887379 | Min. : 0.00 | Min. : 0.0000 | Length:887379 | Min. : 0.0000 | Min. : 0.0 | Min. : 0.0 | Min. : 0.00 | Min. : 0.0000 | Min. : 0 | Min. : 0.00 | Min. : 1.00 | Length:887379 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0.0 | Min. : 0.0000 | Min. : 0.00 | Min. : 0.000 | Length:887379 | Min. : 0.0 | Length:887379 | Length:887379 | Min. : 0.00000 | Min. : 0.0 | Min. :1 | Length:887379 | Min. : 17950 | Min. : 3.0 | Length:887379 | Min. : 0.000000 | Min. : 0 | Min. : 0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0 | Min. : 0.0 | Min. : 0.0 | Min. : 0 | Min. : 0 | Min. : 0.0 | Min. : 0 | Min. : 0.0 | Min. : 0.0 | Min. :-4 | |
| 1st Qu.: 9206643 | 1st Qu.:10877134 | 1st Qu.: 8000 | 1st Qu.: 8000 | 1st Qu.: 8000 | Class :character | 1st Qu.: 9.99 | 1st Qu.: 260.70 | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 45000 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 11.91 | 1st Qu.: 0.0000 | Class :character | 1st Qu.: 0.0000 | 1st Qu.: 15.0 | 1st Qu.: 51.0 | 1st Qu.: 8.00 | 1st Qu.: 0.0000 | 1st Qu.: 6443 | 1st Qu.: 37.70 | 1st Qu.: 17.00 | Class :character | 1st Qu.: 0 | 1st Qu.: 0 | 1st Qu.: 1915 | 1st Qu.: 1900 | 1st Qu.: 1201 | 1st Qu.: 441.5 | 1st Qu.: 0.0000 | 1st Qu.: 0.00 | 1st Qu.: 0.000 | Class :character | 1st Qu.: 280.2 | Class :character | Class :character | 1st Qu.: 0.00000 | 1st Qu.: 27.0 | 1st Qu.:1 | Class :character | 1st Qu.: 76032 | 1st Qu.:13.2 | Class :character | 1st Qu.: 0.000000 | 1st Qu.: 0 | 1st Qu.: 29853 | 1st Qu.: 0.0 | 1st Qu.: 1.0 | 1st Qu.: 0.0 | 1st Qu.: 0.0 | 1st Qu.: 6.0 | 1st Qu.: 10252 | 1st Qu.: 58.6 | 1st Qu.: 0.0 | 1st Qu.: 1 | 1st Qu.: 2411 | 1st Qu.: 47.7 | 1st Qu.: 13900 | 1st Qu.: 0.0 | 1st Qu.: 0.0 | 1st Qu.: 0 | |
| Median :34433267 | Median :37095283 | Median :13000 | Median :13000 | Median :13000 | Mode :character | Median :12.99 | Median : 382.55 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median : 65000 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median : 17.65 | Median : 0.0000 | Mode :character | Median : 0.0000 | Median : 31.0 | Median : 70.0 | Median :11.00 | Median : 0.0000 | Median : 11875 | Median : 56.00 | Median : 24.00 | Mode :character | Median : 6458 | Median : 6456 | Median : 4895 | Median : 4862 | Median : 3215 | Median : 1073.3 | Median : 0.0000 | Median : 0.00 | Median : 0.000 | Mode :character | Median : 462.8 | Mode :character | Mode :character | Median : 0.00000 | Median : 44.0 | Median :1 | Mode :character | Median :101771 | Median :17.6 | Mode :character | Median : 0.000000 | Median : 0 | Median : 80559 | Median : 1.0 | Median : 2.0 | Median : 0.0 | Median : 1.0 | Median : 12.0 | Median : 24684 | Median : 74.9 | Median : 1.0 | Median : 2 | Median : 4483 | Median : 61.9 | Median : 23700 | Median : 0.0 | Median : 0.0 | Median : 2 | |
| Mean :32465133 | Mean :35001825 | Mean :14755 | Mean :14742 | Mean :14702 | NA | Mean :13.25 | Mean : 436.72 | NA | NA | NA | NA | NA | Mean : 75028 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Mean : 18.16 | Mean : 0.3144 | NA | Mean : 0.6946 | Mean : 34.1 | Mean : 70.1 | Mean :11.55 | Mean : 0.1953 | Mean : 16921 | Mean : 55.07 | Mean : 25.27 | NA | Mean : 8403 | Mean : 8400 | Mean : 7559 | Mean : 7521 | Mean : 5758 | Mean : 1754.8 | Mean : 0.3967 | Mean : 45.92 | Mean : 4.881 | NA | Mean : 2164.2 | NA | NA | Mean : 0.01438 | Mean : 44.1 | Mean :1 | NA | Mean :109981 | Mean :18.3 | NA | Mean : 0.004991 | Mean : 226 | Mean : 139458 | Mean : 1.1 | Mean : 2.9 | Mean : 0.8 | Mean : 1.7 | Mean : 20.9 | Mean : 36553 | Mean : 71.5 | Mean : 1.4 | Mean : 3 | Mean : 5888 | Mean : 60.8 | Mean : 32069 | Mean : 0.9 | Mean : 1.5 | Mean : 2 | |
| 3rd Qu.:54908135 | 3rd Qu.:58471347 | 3rd Qu.:20000 | 3rd Qu.:20000 | 3rd Qu.:20000 | NA | 3rd Qu.:16.20 | 3rd Qu.: 572.60 | NA | NA | NA | NA | NA | 3rd Qu.: 90000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.: 23.95 | 3rd Qu.: 0.0000 | NA | 3rd Qu.: 1.0000 | 3rd Qu.: 50.0 | 3rd Qu.: 92.0 | 3rd Qu.:14.00 | 3rd Qu.: 0.0000 | 3rd Qu.: 20829 | 3rd Qu.: 73.60 | 3rd Qu.: 32.00 | NA | 3rd Qu.:13659 | 3rd Qu.:13654 | 3rd Qu.:10617 | 3rd Qu.:10566 | 3rd Qu.: 8000 | 3rd Qu.: 2238.3 | 3rd Qu.: 0.0000 | 3rd Qu.: 0.00 | 3rd Qu.: 0.000 | NA | 3rd Qu.: 831.2 | NA | NA | 3rd Qu.: 0.00000 | 3rd Qu.: 61.0 | 3rd Qu.:1 | NA | 3rd Qu.:132800 | 3rd Qu.:22.6 | NA | 3rd Qu.: 0.000000 | 3rd Qu.: 0 | 3rd Qu.: 208205 | 3rd Qu.: 2.0 | 3rd Qu.: 4.0 | 3rd Qu.: 1.0 | 3rd Qu.: 2.0 | 3rd Qu.: 23.0 | 3rd Qu.: 47858 | 3rd Qu.: 87.6 | 3rd Qu.: 2.0 | 3rd Qu.: 4 | 3rd Qu.: 7772 | 3rd Qu.: 75.2 | 3rd Qu.: 39800 | 3rd Qu.: 1.0 | 3rd Qu.: 2.0 | 3rd Qu.: 3 | |
| Max. :68617057 | Max. :73544841 | Max. :35000 | Max. :35000 | Max. :35000 | NA | Max. :28.99 | Max. :1445.46 | NA | NA | NA | NA | NA | Max. :9500000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Max. :9999.00 | Max. :39.0000 | NA | Max. :33.0000 | Max. :188.0 | Max. :129.0 | Max. :90.00 | Max. :86.0000 | Max. :2904836 | Max. :892.30 | Max. :169.00 | NA | Max. :49373 | Max. :49373 | Max. :57778 | Max. :57778 | Max. :35000 | Max. :24205.6 | Max. :358.6800 | Max. :33520.27 | Max. :7002.190 | NA | Max. :36475.6 | NA | NA | Max. :20.00000 | Max. :188.0 | Max. :1 | NA | Max. :500000 | Max. :43.9 | NA | Max. :14.000000 | Max. :9152545 | Max. :8000078 | Max. :14.0 | Max. :40.0 | Max. :12.0 | Max. :19.0 | Max. :363.0 | Max. :878459 | Max. :223.3 | Max. :22.0 | Max. :43 | Max. :127305 | Max. :151.4 | Max. :9999999 | Max. :17.0 | Max. :35.0 | Max. :32 | |
| NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :4 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :29 | NA | NA’s :29 | NA’s :454312 | NA’s :750326 | NA’s :29 | NA’s :29 | NA | NA’s :502 | NA’s :29 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :145 | NA’s :665676 | NA | NA | NA’s :886868 | NA’s :886870 | NA | NA’s :29 | NA’s :70276 | NA’s :70276 | NA’s :866007 | NA’s :866007 | NA’s :866007 | NA’s :866007 | NA’s :866569 | NA’s :866007 | NA’s :868762 | NA’s :866007 | NA’s :866007 | NA’s :866007 | NA’s :866007 | NA’s :70276 | NA’s :866007 | NA’s :866007 | NA’s :866007 |
To simplify computation and make the data set manageable, we only look at the fully paid or defaulted loans, shrinking the data set from 887'379 entries to roughly 209'000. In addition, we also reduce the number of columns to 15, keeping only the pertinent variables.
# Keep only loans with a terminal status (fully paid or defaulted) and
# restrict the table to the 15 variables relevant for the analysis.
data_loan <- data_loan %>%
  filter(loan_status %in% c("Default", "Fully Paid")) %>%
  select(id, loan_amnt, funded_amnt, term, int_rate, grade, sub_grade,
         annual_inc, loan_status, purpose, addr_state, open_acc, total_acc,
         total_pymnt, emp_length)
# Get a second impression of the reduced data set
kable(summary(data_loan))
| id | loan_amnt | funded_amnt | term | int_rate | grade | sub_grade | annual_inc | loan_status | purpose | addr_state | open_acc | total_acc | total_pymnt | emp_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 54734 | Min. : 500 | Min. : 500 | Length:208942 | Min. : 5.32 | Length:208942 | Length:208942 | Min. : 3000 | Length:208942 | Length:208942 | Length:208942 | Min. : 0.00 | Min. : 2.00 | Min. : 0 | Length:208942 | |
| 1st Qu.: 1443475 | 1st Qu.: 7200 | 1st Qu.: 7200 | Class :character | 1st Qu.:10.16 | Class :character | Class :character | 1st Qu.: 45000 | Class :character | Class :character | Class :character | 1st Qu.: 7.00 | 1st Qu.: 17.00 | 1st Qu.: 7992 | Class :character | |
| Median : 6295801 | Median :12000 | Median :12000 | Mode :character | Median :13.11 | Mode :character | Mode :character | Median : 64000 | Mode :character | Mode :character | Mode :character | Median :10.00 | Median : 24.00 | Median :12909 | Mode :character | |
| Mean :12632478 | Mean :13357 | Mean :13318 | NA | Mean :13.29 | NA | NA | Mean : 74119 | NA | NA | NA | Mean :10.92 | Mean : 25.22 | Mean :15133 | NA | |
| 3rd Qu.:17493205 | 3rd Qu.:18000 | 3rd Qu.:18000 | NA | 3rd Qu.:15.88 | NA | NA | 3rd Qu.: 90000 | NA | NA | NA | 3rd Qu.:14.00 | 3rd Qu.: 32.00 | 3rd Qu.:20480 | NA | |
| Max. :68604659 | Max. :35000 | Max. :35000 | NA | Max. :28.99 | NA | NA | Max. :7141778 | NA | NA | NA | Max. :58.00 | Max. :150.00 | Max. :57778 | NA |
We remove all the entries with no value for the number of years of employment:
# Counting number of cells without any value for the variable year of employment
# (emp_length is a character column whose missing values are encoded as the
# literal string "n/a", so they appear as an ordinary group in this table)
data_loan |>
  select(id,emp_length)|>
  group_by(emp_length)|>
  summarize( n_distinct(id))|>
  ungroup()
## # A tibble: 12 × 2
## emp_length `n_distinct(id)`
## <chr> <int>
## 1 1 year 13987
## 2 10+ years 64123
## 3 2 years 19614
## 4 3 years 16950
## 5 4 years 13488
## 6 5 years 14933
## 7 6 years 12121
## 8 7 years 11554
## 9 8 years 9768
## 10 9 years 7839
## 11 < 1 year 17122
## 12 n/a 7443
# Drop the 7443 rows whose employment length is the literal string "n/a".
# Inside filter() columns are referenced directly by name; prefixing with
# `data_loan$` re-reads the whole column from the global object, which is
# redundant here and breaks silently on grouped or renamed data.
data_loan <- filter(data_loan, emp_length != "n/a")
We add the columns politics and loan_per_income
# Derive the loan-to-income ratio for each borrower
data_loan$loan_per_income <- data_loan$loan_amnt / data_loan$annual_inc
# Tag each state as republican- or democrat-leaning; a state appearing in
# neither list would be left as NA
rep <-c("TX","OK","AR","LA","MS","AL","FL","TN","SC","KY","NC","WV","MT","ID",
        "WY","UT","AK","ND","SD","IA","NE","KS","MO","IN","OH")
dem <-c("VA","GA","DE","MD","HI","AZ","NM","CO","NV","CA","OR","WA","MN","ME",
        "NH","MA","VT","RI","CT","NY","PA","NJ","DC","IL","MI","WI")
data_loan$politics <- ifelse(data_loan$addr_state %in% rep, "REP",
                             ifelse(data_loan$addr_state %in% dem, "DEM", NA))
We rename a few variables for easier handling, replace the original ids with simple sequential ones, and convert the interest rates from percentage points to actual fractions:
# Rename columns for easier handling
data_loan <- data_loan %>%
  rename(us_state = addr_state,
         years_employment = emp_length)
# Replace the original ids with simple sequential labels ("id_1", "id_2", ...)
data_loan$id <- str_c("id_", seq_len(nrow(data_loan)))
# Express interest rates as fractions instead of percentage points
data_loan$int_rate <- data_loan$int_rate / 100
We factorize all the categorical data columns:
# Columns holding categorical data, to be converted to factors below
factor_names <- c("loan_status", "term", "grade", "sub_grade", "purpose",
                  "years_employment", "us_state", "politics")
# Sanity-check the distinct values of every categorical column first
for (factor_name in factor_names) {
  print(unique(data_loan[[factor_name]]))
}
## [1] "Fully Paid" "Default"
## [1] " 36 months" " 60 months"
## [1] "B" "C" "A" "E" "D" "F" "G"
## [1] "B2" "C5" "C1" "A4" "E1" "C3" "B5" "B1" "D1" "C4" "A1" "B3" "A3" "A5" "B4"
## [16] "D5" "D2" "A2" "E4" "C2" "D4" "F3" "D3" "E3" "F1" "E5" "G4" "E2" "F2" "F5"
## [31] "F4" "G5" "G1" "G3" "G2"
## [1] "credit_card" "small_business" "other"
## [4] "wedding" "car" "debt_consolidation"
## [7] "home_improvement" "major_purchase" "medical"
## [10] "moving" "vacation" "house"
## [13] "renewable_energy" "educational"
## [1] "10+ years" "3 years" "9 years" "5 years" "< 1 year" "4 years"
## [7] "1 year" "6 years" "2 years" "7 years" "8 years"
## [1] "AZ" "IL" "CA" "MO" "CT" "UT" "TX" "FL" "MN" "NY" "NJ" "OR" "KY" "OH" "SC"
## [16] "PA" "RI" "LA" "MA" "WI" "VA" "GA" "AL" "NV" "CO" "MD" "WV" "WA" "VT" "MI"
## [31] "DC" "SD" "NC" "AR" "NM" "KS" "HI" "OK" "AK" "WY" "MT" "NH" "DE" "MS" "TN"
## [46] "IA" "NE" "ID" "IN" "ME" "ND"
## [1] "DEM" "REP"
# Convert every categorical column to a factor in a single pass
data_loan[factor_names] <- lapply(data_loan[factor_names], factor)
A few of the values in our data set are too extreme for proper analysis, so we decided to filter them.
To do so, we filter the 99th quantile of the data,
# Define the vector of continuous variables to screen for extreme values
cont_data_names <- c("loan_amnt", "int_rate", "annual_inc", "total_pymnt",
                     "loan_per_income", "open_acc", "total_acc")
# Keep a copy of the unfiltered data so the filter's effect can be plotted
data_old <- data_loan
# Drop, for each continuous variable in turn, all rows at or above its 99th
# percentile. Note the thresholds are recomputed on the already-filtered data,
# so the order of cont_data_names matters slightly.
# (Fixed: the original also stored the removed rows in `extreme_data`, but
# that value was overwritten on every iteration and never read — dead store.)
for (cont_data_name in cont_data_names) {
  threshold <- quantile(data_loan[[cont_data_name]], probs = 0.99)
  print(paste0("The data in ", clean_columns_names[cont_data_name],
               " is filtered with threshold: ", threshold))
  data_loan <- filter(data_loan, !!sym(cont_data_name) < threshold)
}
## [1] "The data in Loan amount is filtered with threshold: 35000"
## [1] "The data in Interest rate is filtered with threshold: 0.245"
## [1] "The data in Annual Income is filtered with threshold: 225000"
## [1] "The data in Total payment is filtered with threshold: 37424.714792132"
## [1] "The data in Loan per income is filtered with threshold: 0.458692179951691"
## [1] "The data in Open account is filtered with threshold: 26"
## [1] "The data in Total account is filtered with threshold: 57"
We then check how the filter affected our data with some boxplots.
# Visualize the effect of the extreme-value filter with side-by-side boxplots
# (filtered on the left, unfiltered on the right, as in the original layout).
# Fixed: aes_string() was deprecated in ggplot2 3.0.0 — use the `.data`
# tidy-eval pronoun to map a column given by a string.
for (cont_data_name in cont_data_names) {
  plt_2 <- ggplot(data_old, aes(x = 1, y = .data[[cont_data_name]])) +
    geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
    theme_minimal() +
    labs(x = clean_columns_names[cont_data_name],
         y = "",
         title = "Unfiltered Data")
  plt_1 <- ggplot(data_loan, aes(x = 1, y = .data[[cont_data_name]])) +
    geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
    theme_minimal() +
    labs(x = clean_columns_names[cont_data_name],
         y = "",
         title = "Filtered Data")
  grid.arrange(plt_1, plt_2, ncol = 2)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Data Analysis
In this section, we focus on getting a good understanding of our data through visual representation. Due to the large number of columns in our data set, we choose a looping-based approach to work through the data.
To do so, we begin by defining some functions to plot both our continuous and categorical data.
# Plot functions ---------------------------------------------------------------
# Utils ........................................................................
# Pick a rotation angle for x-axis tick labels so that the levels of a factor
# do not overlap: 0 degrees below 8 levels, 45 below 15, 90 otherwise.
# Fixed: use `<-` instead of `=` for assignment, and `[[` instead of `[, ]`
# so the helper also works if `data` is ever a tibble (where `df[, name]`
# returns a one-column tibble and nlevels() would yield 0).
find_angle <- function(data, factor_name){
  n_level <- nlevels(data[[factor_name]])
  if (n_level < 8){
    angle <- 0
  } else if ( n_level < 15){
    angle <- 45
  } else {
    angle <- 90
  }
  return(angle)
}
# Categorical variables functions ..............................................
# Plot the default probability for each level of a categorical variable.
# `factor_name` is a string naming a column of `data`.
# Fixes: select the column with all_of() (passing an external vector to
# across() directly was deprecated in tidyselect 1.1.0 — this removes the
# warning the original emitted); replace deprecated aes_string() with the
# `.data` pronoun; rename the local `summary` so it no longer shadows
# base::summary().
plot_cat_relative_default <- function(data, factor_name) {
  # Default probability and group size per factor level
  summary_df <- data |>
    group_by(across(all_of(factor_name))) |>
    summarize(default_probability = mean(loan_status == "Default"),
              n = n()) |>
    ungroup()
  # Unordered factors are re-sorted by default probability for readability
  if (!ordered_factor[factor_name]) {
    summary_df <- summary_df[order(summary_df$default_probability), ]
    factors <- as.character(summary_df[[factor_name]])
    summary_df[[factor_name]] <- factor(factors, levels = factors)
  }
  plt <- ggplot(summary_df, aes(x = default_probability,
                                y = .data[[factor_name]],
                                fill = default_probability)) +
    geom_bar(stat = "identity") +
    labs(x = "Percentage of default",
         y = clean_columns_names[factor_name],
         fill = "Probability of default") +
    scale_fill_viridis(direction = -1) +
    scale_y_discrete(labels = function(x) gsub("_", " ", x)) +
    theme_minimal()
  print(plt)
}
# Plot the counts of paid vs defaulted loans for each level of a categorical
# variable as a stacked bar chart.
# Fixes: aes_string() is deprecated since ggplot2 3.0.0 — use the `.data`
# pronoun; complete themes such as theme_minimal() reset ALL theme settings,
# so it must be applied BEFORE the theme() tweaks (the original set
# legend.position = "bottom" first and theme_minimal() silently discarded it).
plot_cat_default <- function(data, factor_name) {
  # Rotate x labels when the factor has many levels
  angle <- find_angle(data, factor_name)
  plt <- ggplot(data, aes(x = .data[[factor_name]], fill = loan_status)) +
    geom_bar() +
    labs(x = clean_columns_names[factor_name],
         y = "Count",
         fill = "Loan status") +
    scale_fill_viridis(discrete = TRUE, direction = -1) +
    theme_minimal() +
    theme(legend.position = "bottom",
          axis.text.x = element_text(angle = angle, hjust = 1)) +
    scale_x_discrete(labels = function(x) gsub("_", " ", x))
  print(plt)
}
# Continuous variables functions ...............................................
# Plot a histogram of a continuous variable, stacked by loan status.
# Fixes: aes_string() is deprecated since ggplot2 3.0.0 — use the `.data`
# pronoun; the fill legend was mislabeled "Count" even though it shows the
# loan status levels — relabeled to match the other categorical plots.
plot_cont_default_n_paid <- function(data, cont_name) {
  plt <- ggplot(data, aes(x = .data[[cont_name]], fill = loan_status)) +
    geom_histogram() +
    labs(x = clean_columns_names[cont_name],
         y = "Count",
         fill = "Loan status") +
    scale_fill_viridis(discrete = TRUE, direction = -1) +
    theme_minimal()
  print(plt)
}
# Plot a histogram of the defaulted loans only, shaded by per-bin count.
# Fixed: the `..count..` dot-dot notation was deprecated in ggplot2 3.4.0 —
# use after_stat(count) (this removes the warning the original emitted).
plot_cont_default <- function(data, cont_name) {
  default_data <- filter(data, loan_status == "Default")
  plt <- ggplot(default_data,
                aes(x = !!sym(cont_name), fill = after_stat(count))) +
    geom_histogram() +
    labs(x = clean_columns_names[cont_name],
         y = "Count",
         fill = "Count") +
    scale_fill_viridis(direction = -1) +
    theme_minimal()
  print(plt)
}
# Plot the percentage of default for each continuous data
# Bins `column` into 30 equal-width intervals and plots, per bin, the share
# of loans that defaulted.
plot_cont_default_percentage <- function(data, column) {
  # break down the data in 30 bins, find the percentage of default for each bin
  percentage <- data |>
    group_by(cut(!!sym(column), breaks = 30)) |>
    summarize(default_percentage = mean(loan_status == "Default")) |>
    ungroup()
  # rename the column for plotting
  colnames(percentage) <- c("bins", "default_percentage")
  # create a vector for the x-axis
  # NOTE(review): group_by() drops empty bins, so if any of the 30 intervals
  # contains no rows this evenly spaced axis no longer lines up with the bin
  # boundaries — bar positions are approximate; confirm against the cut()
  # labels if exact positions matter.
  min_x <- min(data[[column]])
  max_x <- max(data[[column]])
  x_axis <- seq(from = min_x, to = max_x, length.out = nrow(percentage))
  # add the x axis to the percentage
  percentage$x_axis <- x_axis
  # plot the percentage
  plt <- ggplot(percentage, aes(x = x_axis, y = default_percentage, fill= default_percentage)) +
    geom_bar(stat = "identity") +
    labs(x = clean_columns_names[column],
         y = "Percentage of default",
         fill = "Default percentage") +
    scale_fill_viridis(direction = -1) +
    theme_minimal()
  print(plt)
}
We then loop through all the categorical data to understand how they affect the probability of default.
# Categorical Data plotting ....................................................
cat_data_names <- c("term", "grade", "sub_grade", "purpose",
                    "years_employment", "us_state")
# General view: absolute counts per level, split by loan status
for (cat_column in cat_data_names) {
  plot_cat_default(data_loan, cat_column)
}
# Probability of default per level
for (cat_column in cat_data_names) {
  plot_cat_relative_default(data_loan, cat_column)
}
## Warning: There was 1 warning in `group_by()`.
## ℹ In argument: `across(cat_data)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(cat_data)
##
## # Now:
## data %>% select(all_of(cat_data))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
We also loop through all the continuous data to see how they affect the probability of default.
# Continuous Data plotting .....................................................
# General view: histograms of each continuous variable, split by loan status
for (cont_column in cont_data_names) {
  plot_cont_default_n_paid(data_loan, cont_column)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of defaults per histogram bin, for each continuous variable
for (cont_column in cont_data_names) {
  plot_cont_default(data_loan, cont_column)
}
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Default percentage per bin, for each continuous variable
for (cont_column in cont_data_names) {
  plot_cont_default_percentage(data_loan, cont_column)
}
In this part, we want to understand the differences between Republicans and Democrats.
To do so, we use the same approach we used before, that is defining some plotting functions for our categorical and continuous data, and then looping through them with all of our data:
# Functions --------------------------------------------------------------------
# Categorical Data .............................................................
# Plot the absolute distribution for republicans and democrats across a
# categorical variable as a dodged bar chart.
# Fixes: aes_string() is deprecated since ggplot2 3.0.0 — use the `.data`
# pronoun; theme_minimal() resets all theme settings, so it must come BEFORE
# the theme() tweaks (the original's legend.position was silently discarded).
plot_cat_pol_default <- function(data, factor_name) {
  # find the angle for the x-axis scale
  angle <- find_angle(data, factor_name)
  # create the graph
  plt <- ggplot(data, aes(x = .data[[factor_name]], fill = politics)) +
    geom_bar(position = "dodge") +
    labs(x = clean_columns_names[factor_name],
         y = "Count") +
    scale_fill_manual(values = c("REP" = "red", "DEM" = "blue")) +
    theme_minimal() +
    theme(legend.position = "bottom",
          axis.text.x = element_text(angle = angle, hjust = 1)) +
    scale_x_discrete(labels = function(x) gsub("_", " ", x))
  print(plt)
}
# Plot, per level of a categorical variable, the share of loans that defaulted
# AND belong to each party, as red/blue dodged bars.
# BUG FIX: the original swapped the two summaries — the column labeled DEM was
# computed from politics == "REP" and vice versa, so the red and blue bars
# showed each other's data. Also fixed: across() on an external string is
# deprecated (use all_of()); theme_minimal() must precede the theme() tweaks;
# the y axis was mislabeled "Count" although it shows a probability; local
# renamed so it no longer shadows base::summary().
plot_cat_pol_relative_default <- function(data, factor_name) {
  summary_df <- data |>
    group_by(across(all_of(factor_name))) |>
    summarize(DEM = mean(loan_status == "Default" & politics == "DEM"),
              REP = mean(loan_status == "Default" & politics == "REP")) |>
    ungroup()
  summary_df <- pivot_longer(summary_df, c("REP", "DEM"),
                             names_to = "politics",
                             values_to = "default_probability")
  # find the angle for the x-axis scale
  angle <- find_angle(data, factor_name)
  plt <- ggplot(summary_df, aes(x = !!sym(factor_name),
                                y = default_probability,
                                fill = politics)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(x = clean_columns_names[factor_name],
         y = "Probability of default") +
    scale_fill_manual(values = c("REP" = "red", "DEM" = "blue")) +
    theme_minimal() +
    theme(legend.position = "bottom",
          axis.text.x = element_text(angle = angle, hjust = 1)) +
    scale_x_discrete(labels = function(x) gsub("_", " ", x))
  print(plt)
}
# Plot the scaled density of a continuous variable for republicans and
# democrats, with dashed vertical lines and text labels at each party's mean.
# Fixed: the `size` aesthetic for lines was deprecated in ggplot2 3.4.0 — use
# `linewidth` in geom_vline() (removes the warning the original emitted);
# column access switched from `data[, name]` to `data[[name]]` (tibble-safe,
# identical on a data.frame).
plot_density_pol <- function(data, factor_name){
  # Find the mean for the republicans and the democrats
  mean_dem <- mean(data[[factor_name]][data$politics == "DEM"])
  mean_rep <- mean(data[[factor_name]][data$politics == "REP"])
  # Offset each mean label by 10% of the x range, pushing the smaller mean's
  # label left and the larger one's right so the two never overlap
  x_range <- max(data[[factor_name]]) - min(data[[factor_name]])
  if (mean_dem < mean_rep){
    dem_x_pos <- -(0.1 * x_range)
    rep_x_pos <- (0.1 * x_range)
  } else {
    dem_x_pos <- (0.1 * x_range)
    rep_x_pos <- -(0.1 * x_range)
  }
  # Create the plot
  plt <- ggplot(data = data,
                aes(x = !!sym(factor_name), y = after_stat(scaled),
                    color = politics)) +
    geom_density(alpha = 0.5, linewidth = 1.5) +
    geom_vline(xintercept = c(mean_dem, mean_rep),
               color = c("blue", "red"),
               linetype = "dashed", linewidth = 1) +
    annotate("text", x = mean_dem + dem_x_pos, y = 0.1,
             label = round(mean_dem, 2),
             color = "blue", size = 4, vjust = 0) +
    annotate("text", x = mean_rep + rep_x_pos, y = 0.1,
             label = round(mean_rep, 2),
             color = "red", size = 4, vjust = 0) +
    labs(x = clean_columns_names[factor_name], y = "Density") +
    theme_minimal() +
    scale_color_manual(values = c("blue", "red"))
  print(plt)
}
# Continuous Data ..............................................................
# Plot a dodged histogram of a continuous variable, split by political leaning.
# Fixed: aes_string() is deprecated since ggplot2 3.0.0 — use the `.data`
# tidy-eval pronoun to map a column named by a string.
plot_cont_pol <- function(data, column) {
  plt <- ggplot(data, aes(x = .data[[column]], fill = politics)) +
    geom_histogram(position = "dodge") +
    theme_minimal() +
    scale_fill_manual(values = c("REP" = "red", "DEM" = "blue")) +
    labs(x = clean_columns_names[column],
         y = "Count")
  print(plt)
}
We then plot all the categorical data along the division line of politics:
# Categorical Data .............................................................
# Overall default probability of each party
plot_cat_relative_default(data_loan, "politics")
# Per-variable comparisons between the two parties
pol_cat_data_names <- c("term", "grade", "sub_grade", "purpose", "years_employment")
for (cat_column in pol_cat_data_names) {
  plot_cat_pol_default(data_loan, cat_column)
}
for (cat_column in pol_cat_data_names) {
  plot_cat_pol_relative_default(data_loan, cat_column)
}
And the same for continuous data:
# Continuous Data ..............................................................
# Dodged histograms of each continuous variable, split by party
for (cont_column in cont_data_names) {
  plot_cont_pol(data_loan, cont_column)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density curves (with per-party means) for each continuous variable
for (cont_column in cont_data_names) {
  plot_density_pol(data_loan, cont_column)
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
In this part, we focus on modelling our data with the glmnet library.
First, we can check the correlation coefficients of our continuous data:
# Checking the correlation ------------------------------------------------------
cont_data_names <- c("loan_amnt", "int_rate", "annual_inc", "total_pymnt",
                     "loan_per_income", "open_acc", "total_acc")
# Check for redundant predictors via pairwise correlations
correlation_matrix <- cor(data_loan[cont_data_names])
# Reshape the matrix into long format for ggplot. as.vector() walks the
# matrix column by column, so `x` repeats each name (column index) while
# `y` cycles through the names (row index).
correlation_long <- data.frame(
  x = rep(colnames(correlation_matrix), each = ncol(correlation_matrix)),
  y = rep(colnames(correlation_matrix), times = ncol(correlation_matrix)),
  corr = as.vector(correlation_matrix)
)
# Heatmap of the correlation matrix
ggplot(correlation_long, aes(x = x, y = y, fill = corr)) +
  geom_tile() +
  scale_fill_viridis(direction = 1, limits = c(-1, 1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(fill = "Correlation") +
  scale_x_discrete(labels = function(x) clean_columns_names[x]) +
  scale_y_discrete(labels = function(x) clean_columns_names[x]) +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())
Then we use glmnet to model our data:
# Modelling --------------------------------------------------------------------
# Prepare the predictors: drop the response and the synthetic id
x_data <- data_loan %>%
  select(- loan_status, - id)
# BUG FIX: glmnet() requires a numeric matrix for `x`; passing a data frame
# that still contains factor columns errors out. Expand the factors into
# dummy variables with model.matrix() and drop the intercept column.
x_matrix <- model.matrix(~ ., data = x_data)[, -1]
# Encode the response numerically (factor levels -> 1/2)
# NOTE(review): for a binary outcome, glmnet(..., family = "binomial") would
# be the statistically appropriate model — confirm before interpreting fits.
y_data <- as.numeric(data_loan$loan_status)
# Fit the lasso path (alpha = 1) and plot the coefficient trajectories
model <- glmnet(x = x_matrix, y = y_data, alpha = 1)
plot(model)